Code
library(ggplot2)
library(MASS)
library(reshape2)
library(reshape)
library(patchwork)
library(ggpubr)
library(plotly)
library(dplyr)
library(readr)
library(crayon)
library(pheatmap)Welcome to my presentation! Today I’m going to showcase what I’ve been doing over the summer. This is the culmination of most of my work, I welcome and encourage feedback before my poster presentation!
library(ggplot2)
library(MASS)
library(reshape2)
library(reshape)
library(patchwork)
library(ggpubr)
library(plotly)
library(dplyr)
library(readr)
library(crayon)
library(pheatmap)compare_vcfs = function(quilt_input,geno_input,census_input,x_order,base=FALSE){
census_input$V1 = sub("plexWell-", "", census_input$V1)
census_input$V1 = sub("_GT23-023.._.*", "", census_input$V1)
quilt_input$DONOR = sub("plexWell-", "", quilt_input$DONOR)
quilt_input$DONOR = sub("_GT23-023.._.*", "", quilt_input$DONOR)
geno_input$DONOR = sub("plexWell-", "", geno_input$DONOR)
geno_input$DONOR = sub("_GT23-023.._.*", "", geno_input$DONOR)
census_input = census_input[order(census_input$V1),]
quilt_input$READS = census_input$V2
meltedbar = subset(quilt_input, select = c(DONOR,READS,COUNT))
colnames(meltedbar) <- c('DONOR','Failed','Passed')
meltedbar = melt(meltedbar,id="DONOR")
colnames(meltedbar) <- c('Donor','Filtered','Reads')
p1 <- ggplot(data=meltedbar, aes(x=Donor,y=Reads, fill=Filtered)) +
geom_bar(position="identity",stat="identity") +
scale_fill_manual(values=c("#D41159", "#69C561")) +
scale_x_discrete(limits = x_order) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
panel.background = element_rect(fill='transparent'),
plot.background = element_rect(fill='transparent', color=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.background = element_rect(fill='transparent'),
legend.box.background = element_rect(fill='transparent')
) +
labs(y = "Total Reads") +
guides(fill=guide_legend(title="Reads QC"))
if (base == FALSE){
p1 <- p1 + theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.x =element_blank())
}
quilt_input$G_REP = geno_input$REPRESENTATION
quilt_input$Genoprobs = quilt_input$G_REP - quilt_input$KNOWN
quilt_input$Quilt = quilt_input$REPRESENTATION - quilt_input$KNOWN
meltedline = subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
meltedline = melt(meltedline,id="DONOR")
colnames(meltedline) <- c('Donor','VCF','Differences')
p2 <- ggplot(data = meltedline) +
geom_hline(yintercept=0,size=0.5)+
geom_line(aes(x = Donor, y = Differences, colour = VCF, group = VCF)) +
geom_point(aes(x = Donor, y = Differences, colour = VCF, group = VCF))+
scale_x_discrete(limits = x_order) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
scale_color_manual(values=c('#063970', '#A69943')) +
theme(
panel.background = element_rect(fill='transparent'),
plot.background = element_rect(fill='transparent', color=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.background = element_rect(fill='transparent'),
legend.box.background = element_rect(fill='transparent')
) + labs(y = "∆ Proportion")
quilt_input$Genoprobs = ((quilt_input$G_REP - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
quilt_input$Quilt = ((quilt_input$REPRESENTATION - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
meltedline_error = subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
meltedline_error = melt(meltedline_error,id="DONOR")
colnames(meltedline_error) <- c('Donor','VCF','PercentError')
p3 <- ggplot(data = meltedline_error) +
geom_hline(yintercept=0,size=0.5)+
geom_line(aes(x = Donor, y = PercentError, colour = VCF, group = VCF)) +
geom_point(aes(x = Donor, y = PercentError, colour = VCF, group = VCF))+
scale_x_discrete(limits = x_order) +
scale_color_manual(values=c('#063970', '#A69943')) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
theme(
panel.background = element_rect(fill='transparent'),
plot.background = element_rect(fill='transparent', color=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.background = element_rect(fill='transparent'),
legend.box.background = element_rect(fill='transparent')
) + labs(y = "Percent Error")
stacked_plots <- p2 + p3 + p1 + plot_layout(ncol = 1)
stacked_plots
}log_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")
log_census$V1 = sub("plexWell-", "", log_census$V1)
log_census$V1 = sub("_GT23-023.._.*", "", log_census$V1)
hold = log_census[order(log_census$V2),]
hold = hold$V1Sixteen samples were randomly chosen from the pool of forty-eight and were proportioned in equal, logarithmic, and “random” proportions. Each bar represents the total reads a sample contributed to the pool of three million reads, with magenta reads failing the quality control. Above the difference in estimation versus true proportion for each VCF and their corresponding percent error is graphed.
equal_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_quilt.census.txt",header=TRUE,sep="\t",skip=2)
equal_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_geno.census.txt",header=TRUE,sep="\t",skip=2)
equal_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(equal_quilt,equal_geno,equal_census,hold)Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ggsave("images/equal_proportions.png")Saving 7 x 5 in image
log_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_quilt.census.txt",header=TRUE,sep="\t",skip=2)
log_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_geno.census.txt",header=TRUE,sep="\t",skip=2)
log_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(log_quilt,log_geno,log_census,hold)ggsave("images/log_proportions.png")Saving 7 x 5 in image
random_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_quilt.census.txt",header=TRUE,sep="\t",skip=2)
random_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_geno.census.txt",header=TRUE,sep="\t",skip=2)
random_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(random_quilt,random_geno,random_census,hold,base=TRUE)ggsave("images/random_proportions.png")Saving 7 x 5 in image
# library(reticulate)
# conda_create(envname = "myenv")
# conda_install( envname = "myenv", packages = c("seaborn","pandas","matplotlib","plotly"))
# use_condaenv(condaenv = "myenv",required = TRUE )sample_amounts <- c()
read_numbers <- c()
percent_errors <- c()
proportion_types <- c()
calculate_pe <- function(df) {
df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
av_percent_error <- mean(df$PERCENT_ERROR)
return(av_percent_error)
}
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
for (file_path in files) {
path_parts <- strsplit(file_path, "/")[[1]]
sample_amount <- path_parts[length(path_parts) - 4]
proportion_type <- path_parts[length(path_parts) - 3]
read_number <- path_parts[length(path_parts) - 2]
df <- read_tsv(file_path, skip = 2)
percent_error <- calculate_pe(df)
sample_amounts <- c(sample_amounts, sample_amount)
read_numbers <- c(read_numbers, read_number)
percent_errors <- c(percent_errors, percent_error)
proportion_types <- c(proportion_types, proportion_type)
}
percent_errors <- as.numeric(percent_errors)
reads_to_numbers <- list(
'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
read_numbers <- sapply(read_numbers, function(x) reads_to_numbers[[x]])
samples_to_numbers <- list(
's4' = 4,
's8' = 8,
's16' = 16,
's32' = 32,
's48' = 48
)
sample_amounts <- sapply(sample_amounts, function(x) samples_to_numbers[[x]])
df <- data.frame(
'Reads' = read_numbers,
'Samples' = sample_amounts,
'Percent_Error' = percent_errors,
'Proportion_Type' = proportion_types
)
fig <- plot_ly(df, x = ~Samples, y = ~Reads, z = ~Percent_Error, color = ~Proportion_Type, colors = c('#009E73','#56B4E9','#F0E442'))
fig <- fig %>% add_markers(marker = list(opacity = 0.5))
fig <- fig %>% layout(scene = list(xaxis = list(title = 'Number of Samples'),
yaxis = list(title = 'Number of Reads'),
zaxis = list(title = 'Percent Error')))
figlibrary(dplyr)
library(readr)
library(plotly)
read_numbers <- c()
calculate_pe <- function(df) {
df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
av_percent_error <- mean(df$PERCENT_ERROR)
return(av_percent_error)
}
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/random/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
results <- data.frame(Proportion = numeric(), Percent_Error = numeric(), Reads = character())
for (file_path in files) {
path_parts <- strsplit(file_path, "/")[[1]]
read_number <- path_parts[length(path_parts) - 2]
run_number <- path_parts[length(path_parts) - 1]
df <- read_tsv(file_path, skip = 2)
df <- df %>%
mutate(DONOR = paste0(DONOR, "_", run_number),
Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
Reads = read_number)
results <- rbind(results, df)
}
reads_to_numbers <- list(
'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
results$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)
results$DONOR <- as.factor(results$DONOR)
fig <- results %>%
plot_ly(
x = ~KNOWN,
y = ~Percent_Error,
frame = ~Reads,
text = ~DONOR,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
)
figlibrary(dplyr)
library(readr)
library(plotly)
library(gapminder)
read_numbers <- c()
calculate_pe <- function(df) {
df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
av_percent_error <- mean(df$PERCENT_ERROR)
return(av_percent_error)
}
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/log/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
results <- data.frame(Proportion = numeric(), Percent_Error = numeric(), Reads = character())
for (file_path in files) {
path_parts <- strsplit(file_path, "/")[[1]]
read_number <- path_parts[length(path_parts) - 2]
run_number <- path_parts[length(path_parts) - 1]
df <- read_tsv(file_path, skip = 2)
df <- df %>%
mutate(DONOR = paste0(DONOR, "_", run_number),
Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
Reads = read_number)
results <- rbind(results, df)
}
reads_to_numbers <- list(
'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
results$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)
results$DONOR <- as.factor(results$DONOR)
fig <- results %>%
plot_ly(
x = ~KNOWN,
y = ~Percent_Error,
frame = ~Reads,
text = ~DONOR,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
)
figbowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION
bwa$STAR = (abs(bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = (abs(bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = (abs(bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = (abs(bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100
meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie2,minimap,STAR))
meltedviolin = melt(meltedviolin,id="DONOR")
colnames(meltedviolin) <- c('Donor','Aligner','PercentError')
ggplot(meltedviolin, aes(x=Aligner,y=PercentError,fill=Aligner)) +
geom_violin() +
scale_fill_brewer(palette="Dark2") +
geom_dotplot(binaxis='y',stackdir = 'center',dotsize=1,color="black") +
labs(title="Aligner Type versus Percent Error",x="Aligner", y = "Percent Error") +
theme_minimal() bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION
bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = ((bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100
meltedviolin = subset(bwa, select = c(DONOR,bwa,minimap,bowtie2,STAR))
meltedviolin = melt(meltedviolin,id="DONOR")
colnames(meltedviolin) <- c('Donor','Aligner','PercentError')
ggplot(meltedviolin, aes(x=Aligner,y=PercentError,fill=Aligner)) +
geom_abline(slope=0, intercept=0,colour='#000000') +
geom_violin() +
scale_fill_brewer(palette="Dark2") +
geom_dotplot(binaxis='y',stackdir = 'center',dotsize=1,color="black") +
labs(title="Aligner Type versus Percent Error",x="Aligner", y = "Percent Error") +
theme_minimal() +
theme(legend.position="none")ggsave("images/diff_aligners.png")bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION
bwa$mini = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie = bowtie$REPRESENTATION
bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$mini = ((bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie = ((bwa$bowtie - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)
meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie,mini,STAR))
df = as.data.frame(meltedviolin)
rownames(df) = df$DONOR
df$DONOR = NULL
pheatmap(df)bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION
bwa$mini = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie = bowtie$REPRESENTATION
bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$mini = ((bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie = ((bwa$bowtie - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)
meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie,mini,STAR))
df = as.data.frame(meltedviolin)
rownames(df) = df$DONOR
df$DONOR = NULL
p1 <- pheatmap(df,show_colnames = FALSE,fontsize_row = 6)save_pheatmap_pdf <- function(x, filename, width=10.5, height=6.5) {
stopifnot(!missing(x))
stopifnot(!missing(filename))
pdf(filename, width=width, height=height)
grid::grid.newpage()
grid::grid.draw(x$gtable)
dev.off()
}
save_pheatmap_pdf(p1, "images/pheatmap2.pdf")png
2
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION - STAR$KNOWN
bwa$mini = mini$REPRESENTATION - mini$KNOWN
bwa$bwa = bwa$REPRESENTATION - bwa$KNOWN
bwa$bowtie = bowtie$REPRESENTATION - bowtie$KNOWN
bwa$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)
meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie,mini,STAR))
df = as.data.frame(meltedviolin)
rownames(df) = df$DONOR
df$DONOR = NULL
pheatmap(df)files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 1000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df_one = final_df %>%
group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df_one, aes(x = X2, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 1000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df = final_df %>%
group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 10000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df_ten = final_df %>%
group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df_ten, aes(x = X2, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 10000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df = final_df %>%
group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 100000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df_hundred = final_df %>%
group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df_hundred, aes(x = X2, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2files <- list.files(path = "/flashscratch/munger-rea/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 100000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df = final_df %>%
group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 100000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df_hundred = final_df %>%
group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df_hundred, aes(x = X2, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2coverage_graphs = function(paths,coverage){
files <- list.files(path = paths, pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report_final.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X3 > 0.0105),]
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X3) / final_df$X3 * 100
final_df_c = final_df %>%
group_by(X3) %>%
mutate(Median_Percent_Error = median(Percent_Error))
final_df_c$Coverage = coverage
final_df_c = subset(final_df_c, select = c(DONOR,X3,Median_Percent_Error,Coverage))
return(final_df_c)
}df_0.1 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.1","0.1x")
df_0.2 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.2","0.2x")
df_0.3 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.3","0.3x")
df_0.5 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.5","0.5x")
df_1 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/1","1x")
df_2 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/2","2x")
df_3 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/3","3x")
df_5 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/5","5x")
df_10 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/10","10x")
df_20 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/20","20x")
df_30 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/30","30x")
mega_merge <- rbind(df_0.1,df_0.2,df_0.3,df_0.5,df_1,df_2,df_3,df_5,df_10,df_20,df_30)
coverage_levels <- c("0.1x", "0.2x", "0.3x","0.5x","1x","2x","3x","5x", "10x", "20x", "30x")
mega_merge$Coverage <- factor(mega_merge$Coverage, levels = coverage_levels)
p2 <- ggplot(mega_merge, aes(x = X3, y = Median_Percent_Error)) +
geom_line(aes(color = Coverage)) +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error") +
scale_color_manual(values = c("0.1x" = "#ff0000", "0.2x" = "#ff8700", "0.3x" = "#ffd300", "0.5x" = "#deff0a", "1x" = "#a1ff0a", "2x" = "#0aff99", "3x" = "#0aefff",
"5x" = "#147df5", "10x" = "#580aff", "30x" = "#be0aff", "20x" = "mediumpurple1")) +
geom_point() +
theme_minimal()
p2ggsave("images/coverage.png")files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/10", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report_final.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X3 > 0.0105),]
p1 <- ggplot(final_df, aes(x = X3, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X3) / final_df$X3 * 100
final_df_30 = final_df %>%
group_by(X3) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df_30, aes(x = X3, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
final_df_30$Coverage = "30x"
final_df_30 = subset(final_df_30, select = c(DONOR,X3,Median_Percent_Error,Coverage))
p1 + p2ggsave("images/ten_coverage.png")files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.1", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report_final.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X3 > 0.1),]
p1 <- ggplot(final_df, aes(x = X3, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X3) / final_df$X3 * 100
final_df_0.1 = final_df %>%
group_by(X3) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df_0.1, aes(x = X3, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
final_df_0.1$Coverage = "0.1x"
final_df_0.1 = subset(final_df_0.1, select = c(DONOR,X3,Median_Percent_Error,Coverage))
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.2", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report_final.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X3 > 0.1),]
p1 <- ggplot(final_df, aes(x = X3, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X3) / final_df$X3 * 100
final_df_0.2 = final_df %>%
group_by(X3) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df_0.2, aes(x = X3, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
final_df_0.2$Coverage = "5x"
final_df_0.2 = subset(final_df_0.2, select = c(DONOR,X3,Median_Percent_Error,Coverage))
p1 + p2library(plotly)
dens <- with(diamonds, tapply(price, INDEX = cut, density))
data <- data.frame(
x = unlist(lapply(dens, "[[", "x")),
y = unlist(lapply(dens, "[[", "y")),
cut = rep(names(dens), each = length(dens[[1]]$x)))
fig <- plot_ly(data, x = ~x, y = ~y, z = ~cut, type = 'scatter3d', mode = 'lines', color = ~cut)
figfinal_df_ten$COVERAGE = "Ten"
final_df_one$COVERAGE = "One"
final_df_hundred$COVERAGE = "Hundred"
final_df_hundred = subset(final_df_hundred, select = c(DONOR,X2,Median_Percent_Error,COVERAGE))
final_df_ten = subset(final_df_ten, select = c(DONOR,X2,Median_Percent_Error,COVERAGE))
final_df_one = subset(final_df_one, select = c(DONOR,X2,Median_Percent_Error,COVERAGE))
massive_df = rbind(final_df_hundred,final_df_one,final_df_ten)
massive_df = as.data.frame(massive_df)
fig <- plot_ly(massive_df, x = ~X2, y = ~Median_Percent_Error, z = ~COVERAGE, type = 'scatter3d', mode = 'lines', color = ~COVERAGE)
figlibrary(plotrix)
data <- c(4, 4, 5, 6, 12, 18, 24)
pie3D(data,
col = c("#921915", "#B8514B", "#DFA5A1", "#E1E9ED", "#BFD4EA", "#6292C7", "#174694"),
shade = 0.5)